% Zhang, Hu, Sha: conference paper in the ECCV 2018 proceedings.
% The title is stored in Title Case so each style can choose its own casing;
% the acronym in booktitle is brace-protected against recasing.
@inproceedings{zhang2018cross,
  author    = {Zhang, Bowen and Hu, Hexiang and Sha, Fei},
  title     = {Cross-Modal and Hierarchical Modeling of Video and Text},
  booktitle = {Proceedings of the European Conference on Computer Vision ({ECCV})},
  pages     = {374--390},
  year      = {2018},
}